##########
# Crossing Borders: Petra cleaning
##########

library(here)
library(data.table)

source(here("code", "resources", "country_codes.R")) # get country codes for Petra/Statpop data

# The bfs numbers in the petra data don't clearly align with any vintage of municipality. So instead, just take the unique pairs of bfs and bfs19 numbers to just convert them to 2019 and then to aggregate.
cw_unique <- unique(cw[ , .(bfs, bfs19)])

# address three cases where there are multiple bfs19 numbers for a single bfs number.
cw_unique[bfs == 4511, bfs19 := 4511]
cw_unique[bfs == 4555, bfs19 := 4551]
cw_unique[bfs == 4915, bfs19 := 4506]

muni_organize_petra <- function(year){

  dat <- read_dta(here("data", "cleaning", "PETRA", paste0("STWB__ZE",year,".dta"))); setDT(dat)

  dat[ , west_eu := nation %in% Western_EU]
  dat[ , south_eu := nation %in% South_EU]
  dat[ , ce_eu := nation %in% CentralEast_EU]
  dat[ , former_yugo := nation %in% Former_Yugo]
  dat[ , turkey := nation %in% Turkey]
  dat[ , other := nation >= 300]
  dat[ , germany := nation %in% Germany]
  dat[ , france := nation %in% France]
  dat[ , austria := nation %in% Austria]
  dat[ , italy := nation %in% Italy]
  dat[ , liechtenstein := nation %in% Liechtenstein]
  dat[ , border:= nation %in% Border]

  issues <- setdiff_dw(dat$wohngemeinde, cw$bfs)$in_x_not_y

  if(length(issues) > 0){
    message(paste0("There are ", length(issues), " rogue BfS numbers. They are: ", paste0(issues, collapse = ", "), ". In ", year, ", ", dat[wohngemeinde %in% issues, .N], " immigrants lived in these places!" ))
  } else{
    message("No rogue BfS numbers in ", year, ".")
  }
  
  dat[cw_unique, on = c(wohngemeinde = "bfs"), bfs19 := bfs19]

  muni_dat <- dat[!is.na(bfs19)
    , .("west_eu" = sum(west_eu, na.rm = T),
        "south_eu" = sum(south_eu, na.rm = T),
        "ce_eu" = sum(ce_eu, na.rm = T),
        "former_yugo" = sum(former_yugo, na.rm = T),
        "turkey" = sum(turkey, na.rm = T),
        "other" = sum(other, na.rm = T),
        "germany" = sum(germany, na.rm = T),
        "france" = sum(france, na.rm = T),
        "italy" = sum(italy, na.rm = T),
        "austria" = sum(austria, na.rm = T),
        "liechtenstein" = sum(liechtenstein, na.rm = T),
        "border" = sum(border, na.rm = T),
        "all_imm" = .N)
    , by = bfs19]
  
  missing <- setdiff_dw(muni_dat$bfs19, unique(cw$bfs19))$in_y_not_x
  
  if(length(missing) > 0){
    message("The following ", length(missing), " municipalities are not found in PETRA in ", year,". Imputing them as having 0 immigrants: ", paste0(missing, collapse = ", "))
    
    replacement <- data.table(matrix(0L, nrow = length(missing), ncol = ncol(muni_dat)))
    setnames(replacement, colnames(muni_dat))
    set(replacement, j = "bfs19", value = missing)
    muni_dat <- rbind(muni_dat, replacement)
    
  }

  set(muni_dat, j = "year", value = year)

  return(muni_dat)
}

petra <- rbindlist(lapply(c(1990L:2009L), muni_organize_petra))

message(
  "You've created the following data.table:
  * petra: contains information on immigrants per municipality based on origin."
)

rm(Austria, Border, CentralEast_EU, Former_Yugo, France, Germany, Italy, Liechtenstein, South_EU, Turkey, Western_EU, muni_organize_petra, cw_unique)
